I started with preparing my DFM.
# Load the reviews and build a document-feature matrix (DFM) with quanteda.
tot <- read.csv("clothing_reviews23.csv")
# Replace apostrophes with spaces (leaves stray one-letter tokens "t", "s" — removed below).
tot$text <- gsub("'"," ",tot$text)
myCorpus <- corpus(tot)
# Tokenise, stripping punctuation, numbers, symbols, separators; split hyphenated words.
tok2 <- tokens(myCorpus , remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
Dfm <- dfm(tok2 )
topfeatures(Dfm )
## dress love t size s top fit great like wear
## 2600 2299 1980 1900 1890 1805 1750 1709 1553 1434
# Drop single-character features (the "t"/"s" fragments from the apostrophe replacement).
Dfm <- dfm_remove(Dfm , min_nchar=2)
topfeatures(Dfm )
## dress love size top fit great like wear just fabric
## 2600 2299 1900 1805 1750 1709 1553 1434 1254 1175
# Keep only features occurring at least 5 times.
Dfm <- dfm_trim(Dfm, min_termfreq = 5, verbose=TRUE)
## Removing features occurring:
## - fewer than 5 times: 4,986
## Total features removed: 4,986 (68.2%).
Then I applied the GloVe algorithm via Quanteda. I also built a
distance-weighted FCM.
# Restrict the co-occurrence matrix to the trimmed DFM vocabulary.
Dfm_vocab <- featnames(Dfm )
str(Dfm_vocab)
## chr [1:2328] "absolutely" "wonderful" "silky" "sexy" "comfortable" "love" ...
# Re-tokenise WITHOUT cleaning so word order is preserved for context windows;
# padding = TRUE keeps token positions where out-of-vocabulary words are dropped.
mov_tokens <- tokens(myCorpus)
mov_tokens2 <- tokens_select(mov_tokens, Dfm_vocab, padding = TRUE)
# Feature co-occurrence matrix: 5-token window, counts down-weighted by
# distance (1, 1/2, ..., 1/5).
fcmat_news <- fcm(mov_tokens2, context = "window", count = "weighted", weights = 1/(1:5))
After that I estimated word embeddings via GloVe. I increased the
default number of dimensions to 150 and the maximum number of
co-occurrences to 15. In addition, I set the number of iterations
to 75. I also took the sum of the main and context vectors to get
higher-quality embeddings.
# text2vec GloVe model: 150-dimensional vectors, x_max = 15.
glove <- GlobalVectors$new(rank=150, x_max=15)
# Seed + single thread so the fit is reproducible.
set.seed(123)
system.time(glove_main <- glove$fit_transform(fcmat_news, n_iter = 75, convergence_tol = 0.01, n_threads = 1))
# Context vectors live in $components (dimensions x words).
wv_context <- glove$components
dim(wv_context)
# Sum of main and context vectors typically gives higher-quality embeddings.
glove_main <- glove_main + t(wv_context)
Then I created a dataframe out of the GloVe results. After that I
defined a plot function for the second and third dimensions and created
the plot for several words that I thought could be found in clothing
reviews. From it, we can see a cluster with the words “texture”,
“comfortable” and “cozy”, which makes sense.
# Embedding matrix as a data frame: one row per word, columns V1..V150.
glove_dataframe <- as.data.frame(glove_main)
nrow(glove_dataframe)
## [1] 2878
colnames(glove_dataframe )
## [1] "V1" "V2" "V3" "V4" "V5" "V6" "V7" "V8" "V9" "V10"
## [11] "V11" "V12" "V13" "V14" "V15" "V16" "V17" "V18" "V19" "V20"
## [21] "V21" "V22" "V23" "V24" "V25" "V26" "V27" "V28" "V29" "V30"
## [31] "V31" "V32" "V33" "V34" "V35" "V36" "V37" "V38" "V39" "V40"
## [41] "V41" "V42" "V43" "V44" "V45" "V46" "V47" "V48" "V49" "V50"
## [51] "V51" "V52" "V53" "V54" "V55" "V56" "V57" "V58" "V59" "V60"
## [61] "V61" "V62" "V63" "V64" "V65" "V66" "V67" "V68" "V69" "V70"
## [71] "V71" "V72" "V73" "V74" "V75" "V76" "V77" "V78" "V79" "V80"
## [81] "V81" "V82" "V83" "V84" "V85" "V86" "V87" "V88" "V89" "V90"
## [91] "V91" "V92" "V93" "V94" "V95" "V96" "V97" "V98" "V99" "V100"
## [101] "V101" "V102" "V103" "V104" "V105" "V106" "V107" "V108" "V109" "V110"
## [111] "V111" "V112" "V113" "V114" "V115" "V116" "V117" "V118" "V119" "V120"
## [121] "V121" "V122" "V123" "V124" "V125" "V126" "V127" "V128" "V129" "V130"
## [131] "V131" "V132" "V133" "V134" "V135" "V136" "V137" "V138" "V139" "V140"
## [141] "V141" "V142" "V143" "V144" "V145" "V146" "V147" "V148" "V149" "V150"
# Row names are the words; also keep them as an explicit "word" column (column 151).
glove_dataframe$word <- row.names(glove_dataframe )
# Plot a set of words in a 2-D slice of the embedding space.
#
# words           : character vector of words to display
# glove_dataframe : data frame with one row per word, a "word" column, and
#                   embedding dimensions in the remaining columns
# dims            : the two embedding columns used as x and y coordinates
#                   (defaults to columns 2 and 3, as in the original version)
# xlab, ylab      : axis labels (defaults match the original plot)
#
# Words absent from the vocabulary are skipped with a warning instead of
# producing an error from text() on a zero-length coordinate vector.
plot_words <- function(words, glove_dataframe, dims = c(2, 3),
                       xlab = "Second dimension", ylab = "Third dimension") {
  # Empty canvas; word positions are added as text labels below.
  plot(0, 0, xlim = c(-0.5, 0.5), ylim = c(-0.5, 0.5), type = "n",
       xlab = xlab, ylab = ylab)
  for (word in words) {
    coords <- as.numeric(glove_dataframe[glove_dataframe$word == word, dims])
    if (length(coords) < 2 || anyNA(coords)) {
      warning("word not found in embedding: ", word, call. = FALSE)
      next
    }
    text(coords[1], coords[2], labels = word)
  }
}
plot_words(c("comfortable", "fit", "quality", "fabric", "size", "trendy", "texture", "cozy"), glove_dataframe)

Afterward I plotted all the dimensions together. The plot is a mess,
but some clusters can still be seen, for instance colors around ~4.5 on
V1 and 0 on V2.
# t-SNE projection of the 150-dimensional embeddings down to 2-D.
set.seed(123)
system.time(tsne <- Rtsne(glove_main, perplexity = 50))
## user system elapsed  (Russian-locale header in the original run)
## 11.73 0.00 12.40
str(tsne)
## List of 14
## $ N : int 2878
## $ Y : num [1:2878, 1:2] -0.93948 0.01981 0.4322 0.00947 2.84562 ...
## $ costs : num [1:2878] 0.000704 0.000716 0.001199 0.000863 0.001413 ...
## $ itercosts : num [1:20] 74 74 74 74 74 ...
## $ origD : int 50
## $ perplexity : num 50
## $ theta : num 0.5
## $ max_iter : num 1000
## $ stop_lying_iter : int 250
## $ mom_switch_iter : int 250
## $ momentum : num 0.5
## $ final_momentum : num 0.8
## $ eta : num 200
## $ exaggeration_factor: num 12
## - attr(*, "class")= chr [1:2] "Rtsne" "list"
# $Y holds the 2-D coordinates (one row per word, same order as glove_main).
tsne_plot <- tsne$Y
tsne_plot <- as.data.frame(tsne_plot)
str(tsne_plot)
## 'data.frame': 2878 obs. of 2 variables:
## $ V1: num -0.93948 0.01981 0.4322 0.00947 2.84562 ...
## $ V2: num -3.85 -1.32 2.08 -1.22 -4.16 ...
# Attach the words for labelling.
tsne_plot$word <- row.names(glove_main)
str(tsne_plot)
## 'data.frame': 2878 obs. of 3 variables:
## $ V1 : num -0.93948 0.01981 0.4322 0.00947 2.84562 ...
## $ V2 : num -3.85 -1.32 2.08 -1.22 -4.16 ...
## $ word: chr "Absolutely" "wonderful" "silky" "sexy" ...
# Label every word; dense, but coarse clusters are still visible.
tsne_plot2 <- ggplot(tsne_plot, aes(x = V1, y = V2, label = word)) + geom_text(size = 3)
tsne_plot2

To see the positions of these words I used the following. It can also
be used to inspect the “closeness” of any pair of words.
# Look up t-SNE coordinates of individual words to compare their positions.
tsne_plot[which(tsne_plot$word=="dark"),]
## V1 V2 word
## 430 4.46632 0.6483829 dark
tsne_plot[which(tsne_plot$word=="red"),]
## V1 V2 word
## 907 4.393802 0.06918076 red
Then I computed umap on the entire dataset.
# UMAP of the full embedding. The fit is slow, so it was run once (commented
# out below) and cached with saveRDS; subsequent runs just reload the file.
# set.seed(123)
# system.time(glove_umap <- umap(glove_main, n_components = 2, metric = "cosine", n_neighbors = 20, min_dist = 0.1))
# saveRDS(glove_umap, file = "glove_umap.rds")
glove_umap <- readRDS("glove_umap.rds")
# $layout holds the 2-D coordinates; row names are the words.
df_glove_umap <- as.data.frame(glove_umap$layout)
df_glove_umap$word <- row.names(df_glove_umap)
2D plot without labeling:
ggplot(df_glove_umap) +
geom_point(aes(x = V1, y = V2), colour = 'blue', size = 0.05) +
labs(title = "Word embedding in 2D using UMAP")

Then I looked at some similarities. For the word “cozy” the top similar
words make total sense. However, from the plot of the closest words,
which includes more than just the top 10, we can see that not all close
words are really semantically connected to “cozy”.
# Nearest neighbours of "cozy" by cosine similarity over the GloVe vectors.
cozy <- glove_main["cozy", , drop = FALSE]
cos_sim_cozy <- sim2(x = glove_main, y = cozy, method = "cosine", norm = "l2")
head(sort(cos_sim_cozy[, 1], decreasing = TRUE), 10)
## cozy warm super soft Soooo cycle ha cute
## 1.0000000 0.3680378 0.3428902 0.3127455 0.3038562 0.2858981 0.2819521 0.2700653
## flowing supposed
## 0.2655902 0.2634320
# Top-25 neighbours as a one-column data frame. Named `top_words` rather than
# `select` so that dplyr::select is not shadowed by a data object.
top_words <- data.frame(word = names(head(sort(cos_sim_cozy[, 1], decreasing = TRUE), 25)))
# Keep only the UMAP coordinates of those neighbours.
selected_words <- inner_join(x = df_glove_umap, y = top_words, by = "word")
# Highlight the query word in red, its neighbours in black.
ggplot(selected_words, aes(x = V1, y = V2, colour = word == 'cozy')) +
geom_point(show.legend = FALSE) +
scale_color_manual(values = c('black', 'red')) +
geom_text(aes(V1, V2, label = word), show.legend = FALSE, size = 3.5, vjust=-1.5, hjust=0) +
labs(title = "GloVe word embedding of words related to 'cozy'")

The top 10 words for “quality” are also quite meaningful, and here,
unlike with “cozy”, almost all the other words located close to it
(as can be seen on the plot) are logically connected to it.
# Nearest neighbours of "quality" by cosine similarity over the GloVe vectors.
quality <- glove_main["quality", , drop = FALSE]
cos_sim_quality <- sim2(x = glove_main, y = quality, method = "cosine", norm = "l2")
head(sort(cos_sim_quality[, 1], decreasing = TRUE), 10)
## quality good fabric design great overall material soft
## 1.0000000 0.4723188 0.4612909 0.4170621 0.4001164 0.3932099 0.3885234 0.3421723
## beautiful looks
## 0.3398234 0.3324088
# Top-25 neighbours; named `top_words` (not `select`) so dplyr::select is not shadowed.
top_words <- data.frame(word = names(head(sort(cos_sim_quality[, 1], decreasing = TRUE), 25)))
selected_words <- inner_join(x = df_glove_umap, y = top_words, by = "word")
# Highlight the query word in red, its neighbours in black.
ggplot(selected_words, aes(x = V1, y = V2, colour = word == 'quality')) +
geom_point(show.legend = FALSE) +
scale_color_manual(values = c('black', 'red')) +
geom_text(aes(V1, V2, label = word), show.legend = FALSE, size = 3.5, vjust=-1.5, hjust=0) +
labs(title = "GloVe word embedding of words related to 'quality'")

Situation with “size” is quite similar to “quality”.
# Nearest neighbours of "size" by cosine similarity over the GloVe vectors.
size <- glove_main["size", , drop = FALSE]
cos_sim_size <- sim2(x = glove_main, y = size, method = "cosine", norm = "l2")
head(sort(cos_sim_size[, 1], decreasing = TRUE), 10)
## size small medium petite ordered xs large true
## 1.0000000 0.6439723 0.5655103 0.5422445 0.5387652 0.5283161 0.5133025 0.4956188
## usual fit
## 0.4917295 0.4900316
# Top-25 neighbours; named `top_words` (not `select`) so dplyr::select is not shadowed.
top_words <- data.frame(word = names(head(sort(cos_sim_size[, 1], decreasing = TRUE), 25)))
selected_words <- inner_join(x = df_glove_umap, y = top_words, by = "word")
# Highlight the query word in red, its neighbours in black.
ggplot(selected_words, aes(x = V1, y = V2, colour = word == 'size')) +
geom_point(show.legend = FALSE) +
scale_color_manual(values = c('black', 'red')) +
geom_text(aes(V1, V2, label = word), show.legend = FALSE, size = 3.5, vjust=-1.5, hjust=0) +
labs(title = "GloVe word embedding of words related to 'size'")

After that I created a function to compute the cosine similarity
between exactly 2 words.
# Cosine similarity between the embeddings of two words.
#
# word1, word2 : words to compare (must appear in `vectors$word`)
# vectors      : embedding data frame with a "word" column; every other column
#                is treated as an embedding dimension. Defaults to the global
#                `glove_dataframe`, so existing two-argument calls still work.
#
# Returns a 1x1 matrix (the same shape lsa::cosine produced), so printed
# output keeps the "[,1] / [1,]" form shown in the report.
#
# Note: the original hard-coded columns 1:150, which is only correct while
# "word" is the LAST column; after the later select(word, everything())
# reorder it would silently include the word column. Selecting dimension
# columns by name makes the function order-independent.
similarity <- function(word1, word2, vectors = glove_dataframe){
dim_cols <- setdiff(colnames(vectors), "word")
x <- as.numeric(vectors[vectors$word == word1, dim_cols])
y <- as.numeric(vectors[vectors$word == word2, dim_cols])
if (length(x) == 0 || length(y) == 0) {
stop("word not found in embedding: ",
if (length(x) == 0) word1 else word2, call. = FALSE)
}
# Base-R cosine; crossprod() keeps the 1x1 matrix shape of lsa::cosine.
crossprod(x, y) / sqrt(crossprod(x) * crossprod(y))
}
From similarities of the word “quality” I can say that people (at
least in our dataset) more often wrote about good quality rather than
bad one in their reviews.
# Compare "quality" with evaluative words: positive terms score much higher.
similarity("quality", "high")
## [,1]
## [1,] 0.2900964
similarity("quality", "low")
## [,1]
## [1,] 0.07108497
similarity("quality", "bad")
## [,1]
## [1,] -0.001152466
similarity("quality", "good")
## [,1]
## [1,] 0.4723188
similarity("quality", "great")
## [,1]
## [1,] 0.4001164
If we look at similarities between “size” and “big” and “small”, we
can see that “small” has a higher association with “size”, which can mean
that people write more often about clothes running smaller in size than
about them running bigger. I think this is quite logical, since it is easier
to notice that something is smaller, and if something is bigger one can
think that it is just “oversize”.
# Compare "size" with "big", "small", and "fit".
similarity("size", "big")
## [,1]
## [1,] 0.3019828
similarity("size", "small")
## [,1]
## [1,] 0.6439723
similarity("size", "fit")
## [,1]
## [1,] 0.4900316
After similarities I looked at some analogies. The first one makes
total sense to me (getting “petite” and “medium” from “size” minus
“big” plus “fit” is something very logical).
# Vector analogy: size - big + fit, then find its nearest neighbours.
ex <- glove_main["size", , drop = FALSE] -
glove_main["big", , drop = FALSE] +
glove_main["fit", , drop = FALSE]
cos_sim_test <- sim2(x = glove_main, y = ex , method = "cosine", norm = "l2")
head(sort(cos_sim_test[,1], decreasing = T), 5)
## fit size perfectly petite medium
## 0.7632463 0.6944099 0.4802457 0.4793153 0.4555791
When I do (vec)quality- (vec)low+ (vec)soft, I get “fabric”, “super”
and “comfortable”, which again seems quite logical.
# Vector analogy: quality - low + soft.
ex2 <- glove_main["quality", , drop = FALSE] -
glove_main["low", , drop = FALSE] +
glove_main["soft", , drop = FALSE]
cos_sim_test <- sim2(x = glove_main, y = ex2 , method = "cosine", norm = "l2")
head(sort(cos_sim_test[,1], decreasing = T), 5)
## soft quality fabric super comfortable
## 0.7106433 0.6863965 0.5037753 0.4550513 0.4366626
And with (vec)dress- (vec)style+ (vec)cheap I got “comfortable”, and
“maternity” which is not so obvious but quite interesting result.
# Vector analogy: dress - style + cheap.
ex3 <- glove_main["dress", , drop = FALSE] -
glove_main["style", , drop = FALSE] +
glove_main["cheap", , drop = FALSE]
cos_sim_test <- sim2(x = glove_main, y = ex3 , method = "cosine", norm = "l2")
head(sort(cos_sim_test[,1], decreasing = T), 5)
## cheap dress comfortable maternity made
## 0.6611893 0.5001571 0.3395653 0.3181072 0.3076874
After that I did machine learning classification with the word embeddings.
# Before the reorder, "word" is the last (151st) column.
colnames(glove_dataframe )
## [1] "V1" "V2" "V3" "V4" "V5" "V6" "V7" "V8" "V9" "V10"
## [11] "V11" "V12" "V13" "V14" "V15" "V16" "V17" "V18" "V19" "V20"
## [21] "V21" "V22" "V23" "V24" "V25" "V26" "V27" "V28" "V29" "V30"
## [31] "V31" "V32" "V33" "V34" "V35" "V36" "V37" "V38" "V39" "V40"
## [41] "V41" "V42" "V43" "V44" "V45" "V46" "V47" "V48" "V49" "V50"
## [51] "V51" "V52" "V53" "V54" "V55" "V56" "V57" "V58" "V59" "V60"
## [61] "V61" "V62" "V63" "V64" "V65" "V66" "V67" "V68" "V69" "V70"
## [71] "V71" "V72" "V73" "V74" "V75" "V76" "V77" "V78" "V79" "V80"
## [81] "V81" "V82" "V83" "V84" "V85" "V86" "V87" "V88" "V89" "V90"
## [91] "V91" "V92" "V93" "V94" "V95" "V96" "V97" "V98" "V99" "V100"
## [101] "V101" "V102" "V103" "V104" "V105" "V106" "V107" "V108" "V109" "V110"
## [111] "V111" "V112" "V113" "V114" "V115" "V116" "V117" "V118" "V119" "V120"
## [121] "V121" "V122" "V123" "V124" "V125" "V126" "V127" "V128" "V129" "V130"
## [131] "V131" "V132" "V133" "V134" "V135" "V136" "V137" "V138" "V139" "V140"
## [141] "V141" "V142" "V143" "V144" "V145" "V146" "V147" "V148" "V149" "V150"
## [151] "word"
# Move "word" to the front (dplyr::select), so dimensions occupy columns 2:151.
glove_dataframe <- select(glove_dataframe, word, everything())
colnames(glove_dataframe )
## [1] "word" "V1" "V2" "V3" "V4" "V5" "V6" "V7" "V8" "V9"
## [11] "V10" "V11" "V12" "V13" "V14" "V15" "V16" "V17" "V18" "V19"
## [21] "V20" "V21" "V22" "V23" "V24" "V25" "V26" "V27" "V28" "V29"
## [31] "V30" "V31" "V32" "V33" "V34" "V35" "V36" "V37" "V38" "V39"
## [41] "V40" "V41" "V42" "V43" "V44" "V45" "V46" "V47" "V48" "V49"
## [51] "V50" "V51" "V52" "V53" "V54" "V55" "V56" "V57" "V58" "V59"
## [61] "V60" "V61" "V62" "V63" "V64" "V65" "V66" "V67" "V68" "V69"
## [71] "V70" "V71" "V72" "V73" "V74" "V75" "V76" "V77" "V78" "V79"
## [81] "V80" "V81" "V82" "V83" "V84" "V85" "V86" "V87" "V88" "V89"
## [91] "V90" "V91" "V92" "V93" "V94" "V95" "V96" "V97" "V98" "V99"
## [101] "V100" "V101" "V102" "V103" "V104" "V105" "V106" "V107" "V108" "V109"
## [111] "V110" "V111" "V112" "V113" "V114" "V115" "V116" "V117" "V118" "V119"
## [121] "V120" "V121" "V122" "V123" "V124" "V125" "V126" "V127" "V128" "V129"
## [131] "V130" "V131" "V132" "V133" "V134" "V135" "V136" "V137" "V138" "V139"
## [141] "V140" "V141" "V142" "V143" "V144" "V145" "V146" "V147" "V148" "V149"
## [151] "V150"
# Preview the first embedding dimensions of the first five words.
glove_dataframe[1:5, 2:11]
## V1 V2 V3 V4 V5
## Absolutely 0.23676578 0.28588050 -0.09662717 0.3581747 0.4320785
## wonderful -0.08489634 -0.82225524 -0.49909018 0.1338856 -0.3778849
## silky 0.40627422 -0.30104367 0.15406109 0.8227255 -0.3933628
## sexy 0.17379889 -0.04015286 0.01113281 -1.2606894 -0.7595907
## comfortable 0.07595155 0.47015491 0.17285504 -0.5976691 -0.3170695
## V6 V7 V8 V9 V10
## Absolutely -0.04710816 -0.2564801 0.39553601 0.2901069 0.147889982
## wonderful 0.04187202 -0.3162849 0.08928079 0.6025572 0.125961371
## silky -0.74638879 -0.3467772 -0.41713776 -0.3513734 0.243877645
## sexy 0.18669632 0.1729868 0.67040581 -0.2681141 -0.009611733
## comfortable 0.16327063 0.3133347 0.14258458 0.5474783 -0.413188445
nrow(glove_dataframe)
## [1] 2878
ncol(glove_dataframe)
## [1] 151
Since I have 151 columns in the data frame (the word column plus 150
embedding dimensions), I adjusted the code accordingly.
# Document embeddings: for each document, average the GloVe vectors of the
# words it contains. Documents with no in-vocabulary words get a zero vector.
n_docs <- ndoc(Dfm)
feats <- featnames(Dfm)  # loop-invariant, hoisted out of the loop
embed <- matrix(NA, nrow = n_docs, ncol = 150)
for (i in seq_len(n_docs)) {
  # Progress report every 150 documents.
  if (i %% 150 == 0) message(i, '/', n_docs)
  vec <- as.numeric(Dfm[i, ])
  doc_words <- feats[vec > 0]
  # Columns 2:151 are V1..V150 ("word" is column 1 after the earlier reorder).
  embed_vec <- glove_dataframe[glove_dataframe$word %in% doc_words, 2:151]
  if (nrow(embed_vec) == 0) {
    # No in-vocabulary words: zero vector (checked BEFORE colMeans, so we
    # never compute NaN means only to overwrite them).
    embed[i, ] <- 0
  } else {
    embed[i, ] <- colMeans(embed_vec, na.rm = TRUE)
  }
}
## 150/5000
## 300/5000
## 450/5000
## 600/5000
## 750/5000
## 900/5000
## 1050/5000
## 1200/5000
## 1350/5000
## 1500/5000
## 1650/5000
## 1800/5000
## 1950/5000
## 2100/5000
## 2250/5000
## 2400/5000
## 2550/5000
## 2700/5000
## 2850/5000
## 3000/5000
## 3150/5000
## 3300/5000
## 3450/5000
## 3600/5000
## 3750/5000
## 3900/5000
## 4050/5000
## 4200/5000
## 4350/5000
## 4500/5000
## 4650/5000
## 4800/5000
## 4950/5000
# 5000 documents x 150 embedding dimensions.
str(embed)
## num [1:5000, 1:150] 0.096 -0.0188 0.0368 0.0463 0.0201 ...
# Original data: id, review text, and the binary outcome "Liked".
str(tot)
## 'data.frame': 5000 obs. of 3 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ text : chr " Absolutely wonderful - silky and sexy and comfortable" " Love this dress! it s sooo pretty. i happened to find it in a store, and i m glad i did bc i never would hav"| __truncated__ "Some major design flaws I had such high hopes for this dress and really wanted it to work for me. i initially o"| __truncated__ "My favorite buy! I love, love, love this jumpsuit. it s fun, flirty, and fabulous! every time i wear it, i get "| __truncated__ ...
## $ Liked: int 0 1 0 1 1 0 1 0 1 1 ...
# Classes are roughly balanced (45% / 55%).
prop.table(table(tot$Liked))
##
## 0 1
## 0.4482 0.5518
Then I did Ranger with estimated WE.
# 5-fold cross-validated random forest (ranger) on the document embeddings,
# using the externally defined Function_CV2Ranger.
source("Function_CV2Ranger.R")
Function_CV2Ranger
class(embed)
ttt <- as.matrix(embed)
str(ttt)
# Empty data frame passed as `dt` — presumably a results accumulator used
# inside Function_CV2Ranger; cannot tell from here (defined in sourced file).
data2 <- data.frame()
str(data2)
k <- 5
# Seed so the fold assignment is reproducible.
set.seed(123)
folds <- cvFolds(NROW(embed), K=k)
str(folds)
class(tot$Liked)
# Outcome as a factor for classification.
y <- as.factor(tot$Liked)
table(y)
# ranger requires named predictor columns.
colnames(embed) <- paste0("x",1:ncol(embed))
Ranger_res <- Function_CV2Ranger(input=embed, dt=data2, k=5, DV=y, ML=ranger)
And after that I ran the BoW model. Since the dfm already had column
names, I did not have to add them here.
# Bag-of-words baseline: same CV ranger, this time on the raw DFM counts.
z = as.matrix(Dfm)
length(Dfm@Dimnames$features)
colnames(z)
Ranger_res2 <- Function_CV2Ranger(input=z, dt=data2, k=5, DV=y, ML=ranger)
Finally, I compared the accuracy of the 2 models, and in cross-validation
the BoW model turned out to be better.
# Mean accuracy / balanced accuracy / F1 across the 5 folds, per model.
# Word-embedding model:
colMeans(Ranger_res[ , c(1, 2, 3)])
## Accuracy Avg. Balanced Accuracy Avg. F1
## 0.7318000 0.7229683 0.7244648
# Bag-of-words model (higher on all three metrics):
colMeans(Ranger_res2[ , c(1, 2, 3)])
## Accuracy Avg. Balanced Accuracy Avg. F1
## 0.7822000 0.7750812 0.7771670